# Import the required libraries
import re
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.sentiment import SentimentIntensityAnalyzer
# Load the training and test splits of the Corona NLP tweet dataset.
train_csv_path = "C:/Users/SLIM5/Documents/Corona_NLP_train.csv"
test_csv_path = "C:/Users/SLIM5/Documents/Corona_NLP_test.csv"

# The training file is decoded explicitly as ISO-8859-1; the test file is
# read with pandas' default encoding.
df_train = pd.read_csv(train_csv_path, encoding="ISO-8859-1")
df_test = pd.read_csv(test_csv_path)

# Preview the first rows of the training split.
df_train.head()
| UserName | ScreenName | Location | TweetAt | OriginalTweet | Sentiment | |
|---|---|---|---|---|---|---|
| 0 | 3799 | 48751 | London | 16-03-2020 | @MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i... | Neutral |
| 1 | 3800 | 48752 | UK | 16-03-2020 | advice Talk to your neighbours family to excha... | Positive |
| 2 | 3801 | 48753 | Vagabonds | 16-03-2020 | Coronavirus Australia: Woolworths to give elde... | Positive |
| 3 | 3802 | 48754 | NaN | 16-03-2020 | My food stock is not the only one which is emp... | Positive |
| 4 | 3803 | 48755 | NaN | 16-03-2020 | Me, ready to go at supermarket during the #COV... | Extremely Negative |
# Preview the first five rows of the test split.
df_test.head()
| UserName | ScreenName | Location | TweetAt | OriginalTweet | Sentiment | |
|---|---|---|---|---|---|---|
| 0 | 1 | 44953 | NYC | 02-03-2020 | TRENDING: New Yorkers encounter empty supermar... | Extremely Negative |
| 1 | 2 | 44954 | Seattle, WA | 02-03-2020 | When I couldn't find hand sanitizer at Fred Me... | Positive |
| 2 | 3 | 44955 | NaN | 02-03-2020 | Find out how you can protect yourself and love... | Extremely Positive |
| 3 | 4 | 44956 | Chicagoland | 02-03-2020 | #Panic buying hits #NewYork City as anxious sh... | Negative |
| 4 | 5 | 44957 | Melbourne, Victoria | 03-03-2020 | #toiletpaper #dunnypaper #coronavirus #coronav... | Neutral |
# Summarize column dtypes and non-null counts of the raw training split.
df_train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 41157 entries, 0 to 41156 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 UserName 41157 non-null int64 1 ScreenName 41157 non-null int64 2 Location 32567 non-null object 3 TweetAt 41157 non-null object 4 OriginalTweet 41157 non-null object 5 Sentiment 41157 non-null object dtypes: int64(2), object(4) memory usage: 1.9+ MB
# Work on a copy of the training data in which every missing value is
# replaced by a single space, so later string operations never see NaN.
df_tr = df_train.fillna(value=" ")
df_tr
| UserName | ScreenName | Location | TweetAt | OriginalTweet | Sentiment | |
|---|---|---|---|---|---|---|
| 0 | 3799 | 48751 | London | 16-03-2020 | @MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i... | Neutral |
| 1 | 3800 | 48752 | UK | 16-03-2020 | advice Talk to your neighbours family to excha... | Positive |
| 2 | 3801 | 48753 | Vagabonds | 16-03-2020 | Coronavirus Australia: Woolworths to give elde... | Positive |
| 3 | 3802 | 48754 | 16-03-2020 | My food stock is not the only one which is emp... | Positive | |
| 4 | 3803 | 48755 | 16-03-2020 | Me, ready to go at supermarket during the #COV... | Extremely Negative | |
| ... | ... | ... | ... | ... | ... | ... |
| 41152 | 44951 | 89903 | Wellington City, New Zealand | 14-04-2020 | Airline pilots offering to stock supermarket s... | Neutral |
| 41153 | 44952 | 89904 | 14-04-2020 | Response to complaint not provided citing COVI... | Extremely Negative | |
| 41154 | 44953 | 89905 | 14-04-2020 | You know itÂs getting tough when @KameronWild... | Positive | |
| 41155 | 44954 | 89906 | 14-04-2020 | Is it wrong that the smell of hand sanitizer i... | Neutral | |
| 41156 | 44955 | 89907 | i love you so much || he/him | 14-04-2020 | @TartiiCat Well new/used Rift S are going for ... | Negative |
41157 rows × 6 columns
# Re-check dtypes and non-null counts after filling the missing values.
df_tr.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 41157 entries, 0 to 41156 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 UserName 41157 non-null int64 1 ScreenName 41157 non-null int64 2 Location 41157 non-null object 3 TweetAt 41157 non-null object 4 OriginalTweet 41157 non-null object 5 Sentiment 41157 non-null object dtypes: int64(2), object(4) memory usage: 1.9+ MB
# Class distribution of the sentiment labels, as numbers and as a bar chart.
print(df_tr["Sentiment"].value_counts())

# Fix the bar order explicitly. Without `order`, seaborn draws the bars in
# order of first appearance in the data, so overwriting the tick labels by
# hand (as was done before) attaches the wrong label to each bar.
sentiment_order = [
    "Extremely Positive",
    "Positive",
    "Neutral",
    "Negative",
    "Extremely Negative",
]

plt.figure(figsize=(10, 5))
sns.countplot(x="Sentiment", data=df_tr, order=sentiment_order, palette="Set1")
plt.title(" Sentiment of Tweets")
plt.show()
Positive 11422 Negative 9917 Neutral 7713 Extremely Positive 6624 Extremely Negative 5481 Name: Sentiment, dtype: int64
# Histogram of tweets per value of the TweetAt column.
# NOTE(review): TweetAt is still an object (string) column at this point, so
# the x axis is presumably categorical rather than a date axis — consider
# converting with pd.to_datetime(..., format="%d-%m-%Y") first; TODO confirm.
px.histogram(df_tr, x="TweetAt", nbins=100, title="TweetAt")
import re
import string

# Patterns are compiled once and written so that no invalid string escape is
# involved (the former '\d' literal triggers a Deprecation/SyntaxWarning).
_digit_pattern = re.compile(r'\d')
_punctuation_pattern = re.compile('[%s]' % re.escape(string.punctuation))

# Normalize the tweet text in a single pass over the column:
#   1. delete every digit (replaced by the empty string, not by a space),
#   2. replace every ASCII punctuation character with a space,
#   3. lowercase the result.
df_tr['OriginalTweet'] = [
    _punctuation_pattern.sub(' ', _digit_pattern.sub('', tweet)).lower()
    for tweet in df_tr['OriginalTweet']
]
df_tr.head(10)
| UserName | ScreenName | Location | TweetAt | OriginalTweet | Sentiment | |
|---|---|---|---|---|---|---|
| 0 | 3799 | 48751 | London | 16-03-2020 | menyrbie phil gahan chrisitv https t co i... | Neutral |
| 1 | 3800 | 48752 | UK | 16-03-2020 | advice talk to your neighbours family to excha... | Positive |
| 2 | 3801 | 48753 | Vagabonds | 16-03-2020 | coronavirus australia woolworths to give elde... | Positive |
| 3 | 3802 | 48754 | 16-03-2020 | my food stock is not the only one which is emp... | Positive | |
| 4 | 3803 | 48755 | 16-03-2020 | me ready to go at supermarket during the cov... | Extremely Negative | |
| 5 | 3804 | 48756 | ÃT: 36.319708,-82.363649 | 16-03-2020 | as news of the regionâs first confirmed covid... | Positive |
| 6 | 3805 | 48757 | 35.926541,-78.753267 | 16-03-2020 | cashier at grocery store was sharing his insig... | Positive |
| 7 | 3806 | 48758 | Austria | 16-03-2020 | was at the supermarket today didn t buy toile... | Neutral |
| 8 | 3807 | 48759 | Atlanta, GA USA | 16-03-2020 | due to covid our retail store and classroom i... | Positive |
| 9 | 3808 | 48760 | BHAVNAGAR,GUJRAT | 16-03-2020 | for corona prevention we should stop to buy th... | Negative |
import nltk
from nltk.tokenize import word_tokenize

# Split every cleaned tweet into word tokens and keep them in a new column.
# The tokenizer needs the 'punkt' resource; uncomment on first use:
#nltk.download('punkt')
df_tr['text_wt'] = df_tr['OriginalTweet'].apply(word_tokenize)
df_tr
| UserName | ScreenName | Location | TweetAt | OriginalTweet | Sentiment | text_wt | |
|---|---|---|---|---|---|---|---|
| 0 | 3799 | 48751 | London | 16-03-2020 | menyrbie phil gahan chrisitv https t co i... | Neutral | [menyrbie, phil, gahan, chrisitv, https, t, co... |
| 1 | 3800 | 48752 | UK | 16-03-2020 | advice talk to your neighbours family to excha... | Positive | [advice, talk, to, your, neighbours, family, t... |
| 2 | 3801 | 48753 | Vagabonds | 16-03-2020 | coronavirus australia woolworths to give elde... | Positive | [coronavirus, australia, woolworths, to, give,... |
| 3 | 3802 | 48754 | 16-03-2020 | my food stock is not the only one which is emp... | Positive | [my, food, stock, is, not, the, only, one, whi... | |
| 4 | 3803 | 48755 | 16-03-2020 | me ready to go at supermarket during the cov... | Extremely Negative | [me, ready, to, go, at, supermarket, during, t... | |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 41152 | 44951 | 89903 | Wellington City, New Zealand | 14-04-2020 | airline pilots offering to stock supermarket s... | Neutral | [airline, pilots, offering, to, stock, superma... |
| 41153 | 44952 | 89904 | 14-04-2020 | response to complaint not provided citing covi... | Extremely Negative | [response, to, complaint, not, provided, citin... | |
| 41154 | 44953 | 89905 | 14-04-2020 | you know itâs getting tough when kameronwild... | Positive | [you, know, itâs, getting, tough, when, kamer... | |
| 41155 | 44954 | 89906 | 14-04-2020 | is it wrong that the smell of hand sanitizer i... | Neutral | [is, it, wrong, that, the, smell, of, hand, sa... | |
| 41156 | 44955 | 89907 | i love you so much || he/him | 14-04-2020 | tartiicat well new used rift s are going for ... | Negative | [tartiicat, well, new, used, rift, s, are, goi... |
41157 rows × 7 columns
# Load NLTK's English stopword list as a set for fast membership tests.
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# Keep only the tokens of each tweet that are not stopwords.
df_tr['text_SW'] = df_tr['text_wt'].apply(
    lambda tokens: [token for token in tokens if token not in stop_words]
)
df_tr
| UserName | ScreenName | Location | TweetAt | OriginalTweet | Sentiment | text_wt | text_SW | |
|---|---|---|---|---|---|---|---|---|
| 0 | 3799 | 48751 | London | 16-03-2020 | menyrbie phil gahan chrisitv https t co i... | Neutral | [menyrbie, phil, gahan, chrisitv, https, t, co... | [menyrbie, phil, gahan, chrisitv, https, co, i... |
| 1 | 3800 | 48752 | UK | 16-03-2020 | advice talk to your neighbours family to excha... | Positive | [advice, talk, to, your, neighbours, family, t... | [advice, talk, neighbours, family, exchange, p... |
| 2 | 3801 | 48753 | Vagabonds | 16-03-2020 | coronavirus australia woolworths to give elde... | Positive | [coronavirus, australia, woolworths, to, give,... | [coronavirus, australia, woolworths, give, eld... |
| 3 | 3802 | 48754 | 16-03-2020 | my food stock is not the only one which is emp... | Positive | [my, food, stock, is, not, the, only, one, whi... | [food, stock, one, empty, please, panic, enoug... | |
| 4 | 3803 | 48755 | 16-03-2020 | me ready to go at supermarket during the cov... | Extremely Negative | [me, ready, to, go, at, supermarket, during, t... | [ready, go, supermarket, covid, outbreak, para... | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 41152 | 44951 | 89903 | Wellington City, New Zealand | 14-04-2020 | airline pilots offering to stock supermarket s... | Neutral | [airline, pilots, offering, to, stock, superma... | [airline, pilots, offering, stock, supermarket... |
| 41153 | 44952 | 89904 | 14-04-2020 | response to complaint not provided citing covi... | Extremely Negative | [response, to, complaint, not, provided, citin... | [response, complaint, provided, citing, covid,... | |
| 41154 | 44953 | 89905 | 14-04-2020 | you know itâs getting tough when kameronwild... | Positive | [you, know, itâs, getting, tough, when, kamer... | [know, itâs, getting, tough, kameronwilds, ra... | |
| 41155 | 44954 | 89906 | 14-04-2020 | is it wrong that the smell of hand sanitizer i... | Neutral | [is, it, wrong, that, the, smell, of, hand, sa... | [wrong, smell, hand, sanitizer, starting, turn... | |
| 41156 | 44955 | 89907 | i love you so much || he/him | 14-04-2020 | tartiicat well new used rift s are going for ... | Negative | [tartiicat, well, new, used, rift, s, are, goi... | [tartiicat, well, new, used, rift, going, amaz... |
41157 rows × 8 columns
def lemmatize_text(text):
    """Return a list with the WordNet lemma of every token in *text*.

    *text* is iterated token by token; each token is passed to
    ``WordNetLemmatizer.lemmatize`` with its default arguments and the
    results are collected in the same order.
    """
    wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()
    return list(map(wordnet_lemmatizer.lemmatize, text))
# Lemmatize the stopword-free token list of every tweet into a new column.
df_tr['lemma'] = df_tr['text_SW'].map(lemmatize_text)
df_tr.head()
| UserName | ScreenName | Location | TweetAt | OriginalTweet | Sentiment | text_wt | text_SW | lemma | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 3799 | 48751 | London | 16-03-2020 | menyrbie phil gahan chrisitv https t co i... | Neutral | [menyrbie, phil, gahan, chrisitv, https, t, co... | [menyrbie, phil, gahan, chrisitv, https, co, i... | [menyrbie, phil, gahan, chrisitv, http, co, if... |
| 1 | 3800 | 48752 | UK | 16-03-2020 | advice talk to your neighbours family to excha... | Positive | [advice, talk, to, your, neighbours, family, t... | [advice, talk, neighbours, family, exchange, p... | [advice, talk, neighbour, family, exchange, ph... |
| 2 | 3801 | 48753 | Vagabonds | 16-03-2020 | coronavirus australia woolworths to give elde... | Positive | [coronavirus, australia, woolworths, to, give,... | [coronavirus, australia, woolworths, give, eld... | [coronavirus, australia, woolworth, give, elde... |
| 3 | 3802 | 48754 | 16-03-2020 | my food stock is not the only one which is emp... | Positive | [my, food, stock, is, not, the, only, one, whi... | [food, stock, one, empty, please, panic, enoug... | [food, stock, one, empty, please, panic, enoug... | |
| 4 | 3803 | 48755 | 16-03-2020 | me ready to go at supermarket during the cov... | Extremely Negative | [me, ready, to, go, at, supermarket, during, t... | [ready, go, supermarket, covid, outbreak, para... | [ready, go, supermarket, covid, outbreak, para... |
# Join each tweet's lemmatized tokens back into one space-separated string,
# the input format expected by the vectorizer and the word cloud.
df_tr['lemma2'] = df_tr['lemma'].map(' '.join)
df_tr.head()
| UserName | ScreenName | Location | TweetAt | OriginalTweet | Sentiment | text_wt | text_SW | lemma | lemma2 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3799 | 48751 | London | 16-03-2020 | menyrbie phil gahan chrisitv https t co i... | Neutral | [menyrbie, phil, gahan, chrisitv, https, t, co... | [menyrbie, phil, gahan, chrisitv, https, co, i... | [menyrbie, phil, gahan, chrisitv, http, co, if... | menyrbie phil gahan chrisitv http co ifzfanpa ... |
| 1 | 3800 | 48752 | UK | 16-03-2020 | advice talk to your neighbours family to excha... | Positive | [advice, talk, to, your, neighbours, family, t... | [advice, talk, neighbours, family, exchange, p... | [advice, talk, neighbour, family, exchange, ph... | advice talk neighbour family exchange phone nu... |
| 2 | 3801 | 48753 | Vagabonds | 16-03-2020 | coronavirus australia woolworths to give elde... | Positive | [coronavirus, australia, woolworths, to, give,... | [coronavirus, australia, woolworths, give, eld... | [coronavirus, australia, woolworth, give, elde... | coronavirus australia woolworth give elderly d... |
| 3 | 3802 | 48754 | 16-03-2020 | my food stock is not the only one which is emp... | Positive | [my, food, stock, is, not, the, only, one, whi... | [food, stock, one, empty, please, panic, enoug... | [food, stock, one, empty, please, panic, enoug... | food stock one empty please panic enough food ... | |
| 4 | 3803 | 48755 | 16-03-2020 | me ready to go at supermarket during the cov... | Extremely Negative | [me, ready, to, go, at, supermarket, during, t... | [ready, go, supermarket, covid, outbreak, para... | [ready, go, supermarket, covid, outbreak, para... | ready go supermarket covid outbreak paranoid f... |
# Build the Bag-of-Words representation of the lemmatized tweets.
from sklearn.feature_extraction.text import CountVectorizer

# Limit the vocabulary to the 5000 most frequent terms.
cv = CountVectorizer(max_features=5000)
term_counts = cv.fit_transform(df_tr['lemma2'])

# Dense (n_tweets x 5000) count matrix used by the later cells.
BOW = term_counts.toarray()
BOW
array([[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
...,
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0]], dtype=int64)
from wordcloud import WordCloud

# Word cloud over the whole lemmatized corpus.
text = df_tr['lemma2'].values

# Join the tweets explicitly. str() on a large NumPy array returns its
# abbreviated repr (first/last few items and "..."), which would build the
# cloud from only a handful of tweets.
corpus = " ".join(text)

wordcloud = WordCloud(
    width=1600,
    height=800,
    background_color='black',
    colormap='Pastel1',
    collocations=False,  # count single words only, no bigrams
).generate(corpus)

plt.figure(figsize=(20, 10), facecolor='k')
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
from sklearn.preprocessing import LabelEncoder

# Map the sentiment class names to integer codes in a new column.
Encoder = LabelEncoder()
Encoder.fit(df_tr['Sentiment'])
df_tr['Sent2'] = Encoder.transform(df_tr['Sentiment'])
df_tr
| UserName | ScreenName | Location | TweetAt | OriginalTweet | Sentiment | text_wt | text_SW | lemma | lemma2 | Sent2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3799 | 48751 | London | 16-03-2020 | menyrbie phil gahan chrisitv https t co i... | Neutral | [menyrbie, phil, gahan, chrisitv, https, t, co... | [menyrbie, phil, gahan, chrisitv, https, co, i... | [menyrbie, phil, gahan, chrisitv, http, co, if... | menyrbie phil gahan chrisitv http co ifzfanpa ... | 3 |
| 1 | 3800 | 48752 | UK | 16-03-2020 | advice talk to your neighbours family to excha... | Positive | [advice, talk, to, your, neighbours, family, t... | [advice, talk, neighbours, family, exchange, p... | [advice, talk, neighbour, family, exchange, ph... | advice talk neighbour family exchange phone nu... | 4 |
| 2 | 3801 | 48753 | Vagabonds | 16-03-2020 | coronavirus australia woolworths to give elde... | Positive | [coronavirus, australia, woolworths, to, give,... | [coronavirus, australia, woolworths, give, eld... | [coronavirus, australia, woolworth, give, elde... | coronavirus australia woolworth give elderly d... | 4 |
| 3 | 3802 | 48754 | 16-03-2020 | my food stock is not the only one which is emp... | Positive | [my, food, stock, is, not, the, only, one, whi... | [food, stock, one, empty, please, panic, enoug... | [food, stock, one, empty, please, panic, enoug... | food stock one empty please panic enough food ... | 4 | |
| 4 | 3803 | 48755 | 16-03-2020 | me ready to go at supermarket during the cov... | Extremely Negative | [me, ready, to, go, at, supermarket, during, t... | [ready, go, supermarket, covid, outbreak, para... | [ready, go, supermarket, covid, outbreak, para... | ready go supermarket covid outbreak paranoid f... | 0 | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 41152 | 44951 | 89903 | Wellington City, New Zealand | 14-04-2020 | airline pilots offering to stock supermarket s... | Neutral | [airline, pilots, offering, to, stock, superma... | [airline, pilots, offering, stock, supermarket... | [airline, pilot, offering, stock, supermarket,... | airline pilot offering stock supermarket shelf... | 3 |
| 41153 | 44952 | 89904 | 14-04-2020 | response to complaint not provided citing covi... | Extremely Negative | [response, to, complaint, not, provided, citin... | [response, complaint, provided, citing, covid,... | [response, complaint, provided, citing, covid,... | response complaint provided citing covid relat... | 0 | |
| 41154 | 44953 | 89905 | 14-04-2020 | you know itâs getting tough when kameronwild... | Positive | [you, know, itâs, getting, tough, when, kamer... | [know, itâs, getting, tough, kameronwilds, ra... | [know, itâs, getting, tough, kameronwilds, ra... | know itâs getting tough kameronwilds rationin... | 4 | |
| 41155 | 44954 | 89906 | 14-04-2020 | is it wrong that the smell of hand sanitizer i... | Neutral | [is, it, wrong, that, the, smell, of, hand, sa... | [wrong, smell, hand, sanitizer, starting, turn... | [wrong, smell, hand, sanitizer, starting, turn... | wrong smell hand sanitizer starting turn coron... | 3 | |
| 41156 | 44955 | 89907 | i love you so much || he/him | 14-04-2020 | tartiicat well new used rift s are going for ... | Negative | [tartiicat, well, new, used, rift, s, are, goi... | [tartiicat, well, new, used, rift, going, amaz... | [tartiicat, well, new, used, rift, going, amaz... | tartiicat well new used rift going amazon rn a... | 2 |
41157 rows × 11 columns
# Class distribution of the integer-encoded sentiment labels.
sns.countplot(data=df_tr, x='Sent2')
<AxesSubplot:xlabel='Sent2', ylabel='count'>
from sklearn.model_selection import train_test_split

# Hold out 30% of the tweets for evaluation. A fixed random_state makes the
# split (and therefore every reported score) reproducible, and stratifying
# on the label keeps the class proportions equal in both parts.
Train_X, Test_X, Train_Y, Test_Y = train_test_split(
    BOW,
    df_tr['Sent2'],
    test_size=0.3,
    random_state=42,
    stratify=df_tr['Sent2'],
)

# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2 in
# favour of get_feature_names_out(); use whichever this version provides.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Peek at the held-out feature matrix with the vocabulary as column names.
pd.DataFrame(Test_X, columns=feature_names).head()
| ab | abc | abcnews | ability | able | absolute | absolutely | absurd | abt | abuse | ... | youâ | yoy | yr | yâ | zealand | zero | zimbabwe | zombie | zone | zoom | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 5000 columns
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# The default iteration limit is too low for this feature matrix: lbfgs
# stops early with a ConvergenceWarning, so give it more iterations.
lr = LogisticRegression(max_iter=1000)
lr.fit(Train_X, Train_Y)
log_pred = lr.predict(Test_X)

# scikit-learn metrics take (y_true, y_pred) in that order. Passing them the
# other way round swaps precision and recall in the report and makes the
# support column count predictions instead of true labels.
print(metrics.classification_report(Test_Y, log_pred))
# Use accuracy_score function to get the accuracy
print("Accuracy: Logistic Regression-> ",round(metrics.accuracy_score(Test_Y, log_pred)*100,2),"%")
C:\Users\SLIM5\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:762: ConvergenceWarning:
lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
precision recall f1-score support
0 0.60 0.63 0.62 1560
1 0.64 0.69 0.67 1810
2 0.52 0.54 0.53 2862
3 0.73 0.65 0.69 2675
4 0.58 0.57 0.58 3441
accuracy 0.61 12348
macro avg 0.62 0.62 0.62 12348
weighted avg 0.61 0.61 0.61 12348
Accuracy: Logistic Regression-> 60.77 %
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in
# 1.2. Building the matrix and wrapping it in ConfusionMatrixDisplay draws
# the same figure and works on both old and new releases.
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

test_confusion = confusion_matrix(Test_Y, lr.predict(Test_X))

# Abbreviations follow the encoded class order 0-4: Extremely Negative,
# Extremely Positive, Negative, Neutral, Positive.
ConfusionMatrixDisplay(
    confusion_matrix=test_confusion,
    display_labels=["EN","EP","N","Neu","P"],
).plot()
plt.show()